import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import numpy as np
from scipy import stats
import sys
sys.path.append(sys.argv[1])

import pandas as pd  
import pickle
  
# Read the customer table from the directory passed as the first CLI argument.
dataset_path = os.path.join(sys.argv[1], 'credit_customers.csv')
credit_customers = pd.read_csv(dataset_path)

# Tally the null entries found in each column.
missing_values = credit_customers.isna().sum()

print("Missing values per column:\n", missing_values)
# pickle.dump(missing_values,open("./ref_result/missing_values.pkl","wb"))


import pandas as pd  
import pickle 
  
# Columns stored with the generic object dtype are treated as categorical.
object_typed = credit_customers.select_dtypes(include=['object'])
categorical_columns = object_typed.columns

print("categorical_columns:\n", categorical_columns)
# pickle.dump(categorical_columns,open("./ref_result/categorical_columns.pkl","wb"))

import pandas as pd  
import pickle 
  
# Every categorical column is carried over as nominal; no column is singled
# out as ordinal here, so the list is a plain copy of the column labels.
nominal_categorical_columns = list(categorical_columns)

print("nominal_categorical_columns:\n", nominal_categorical_columns)
# pickle.dump(nominal_categorical_columns,open("./ref_result/nominal_categorical_columns.pkl","wb"))

import pandas as pd  
import pickle 
  
# Report the recommended encoding, listing one nominal column per line.
print("Suggested encoding methods:")
print("1. One-hot encoding for nominal categorical variables:")
for column_name in nominal_categorical_columns:
    print("   -", column_name)

print("nominal_categorical_columns:\n", nominal_categorical_columns)
# pickle.dump(nominal_categorical_columns,open("./ref_result/nominal_categorical_columns.pkl","wb"))

import pandas as pd  
import pickle 
  
# One-hot encode the nominal columns of the customer table.
credit_customers_encoded = pd.get_dummies(
    credit_customers,
    columns=nominal_categorical_columns,
)

print("credit_customers_encoded:\n", credit_customers_encoded)
# pickle.dump(credit_customers_encoded,open("./ref_result/credit_customers_encoded.pkl","wb"))

import pandas as pd  
import pickle
  
# Collect the labels of every numeric-dtype column in the customer table.
numeric_frame = credit_customers.select_dtypes(include=['number'])
numerical_columns = numeric_frame.columns

print("numerical_columns:\n", numerical_columns)
# pickle.dump(numerical_columns,open("./ref_result/numerical_columns.pkl","wb"))


import pandas as pd  
import pickle
  
# Spread (max - min) of each numeric column, keyed by column label.
ranges = {
    col: credit_customers[col].max() - credit_customers[col].min()
    for col in numerical_columns
}

print("ranges:\n", ranges)
# pickle.dump(ranges,open("./ref_result/ranges.pkl","wb"))


import pandas as pd  
import pickle
  
# Scaling is flagged as soon as any numeric column spans more than one unit;
# any() stops at the first such column, like the explicit break it replaces.
normalization_needed = any(range_ > 1 for range_ in ranges.values())

print("normalization_needed:\n", normalization_needed)
# pickle.dump(normalization_needed,open("./ref_result/normalization_needed.pkl","wb"))


import pandas as pd  
from sklearn.preprocessing import StandardScaler   
import pickle
  
# Standardize the numeric columns when the range check asked for it.
if not normalization_needed:
    print("Normalization is not needed.")
else:
    scaler = StandardScaler()
    standardized_values = scaler.fit_transform(credit_customers[numerical_columns])

    # Write the scaled values into a copy so the raw table stays untouched.
    credit_customers_normalized = credit_customers.copy()
    credit_customers_normalized[numerical_columns] = standardized_values

    print("Data after Standard Scaling:\n", credit_customers_normalized.head())
    print("credit_customers_normalized:\n", credit_customers_normalized)
    # pickle.dump(credit_customers_normalized,open("./ref_result/credit_customers_normalized.pkl","wb"))


import pandas as pd  
import pickle 
  
# Features used for segmentation; the copy keeps later edits off the source table.
important_columns = ['credit_history', 'age', 'employment', 'credit_amount', 'savings_status']
data_for_clustering = credit_customers.loc[:, important_columns].copy()

print("data_for_clustering:\n", data_for_clustering)
# pickle.dump(data_for_clustering,open("./ref_result/data_for_clustering.pkl","wb"))

import pandas as pd  
from sklearn.preprocessing import LabelEncoder 
import pickle 
  
# Replace the two categorical features with integer codes, one encoder each.
le_savings_status = LabelEncoder()
le_employment = LabelEncoder()
for feature_name, encoder in (
    ('savings_status', le_savings_status),
    ('employment', le_employment),
):
    data_for_clustering[feature_name] = encoder.fit_transform(data_for_clustering[feature_name])

print("data_for_clustering:\n", data_for_clustering)
# pickle.dump(data_for_clustering,open("./ref_result/data_for_clustering.pkl","wb"))

import pandas as pd  
import pickle 
  
# One-hot encode credit_history, dropping the first level's indicator column.
data_for_clustering = pd.get_dummies(
    data_for_clustering,
    columns=['credit_history'],
    drop_first=True,
)

print("data_for_clustering:\n", data_for_clustering)
# pickle.dump(data_for_clustering,open("./ref_result/data_for_clustering.pkl","wb"))

import pandas as pd  
from sklearn.preprocessing import StandardScaler
import pickle 
  
# Standardize every clustering feature: fit the scaler, then apply it to the same data.
scaler = StandardScaler()
scaler.fit(data_for_clustering)
data_for_clustering_scaled = scaler.transform(data_for_clustering)

print("data_for_clustering_scaled:\n", data_for_clustering_scaled)
# pickle.dump(data_for_clustering_scaled,open("./ref_result/data_for_clustering_scaled.pkl","wb"))

import pandas as pd  
from sklearn.cluster import KMeans 
from sklearn.metrics import silhouette_score 
import matplotlib.pyplot as plt
  
# Score K-means for a range of cluster counts and plot the silhouette curve.

# Candidate cluster counts, shared by the fitting loop and the plot's x-axis.
# Starts at 2 because the silhouette score is not defined for 1 cluster.
cluster_range = range(2, 15)

silhouette_scores = []
for cluster_num in cluster_range:
    kmeans = KMeans(n_clusters=cluster_num, random_state=42)
    cluster_labels = kmeans.fit_predict(data_for_clustering_scaled)
    silhouette_avg = silhouette_score(data_for_clustering_scaled, cluster_labels)
    silhouette_scores.append(silhouette_avg)

# Plot silhouette scores
plt.figure(figsize=(10, 6))
plt.plot(cluster_range, silhouette_scores, marker='o', linestyle='--')
plt.title('Silhouette Scores for Different Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Score')
plt.grid(True)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('ref_result', exist_ok=True)
plt.savefig('ref_result/Silhouette_Scores.png')
# plt.show()

import pandas as pd  
import pickle 
  
# Start again from the raw customer table with the five clustering features.
important_columns = ['credit_history', 'age', 'employment', 'credit_amount', 'savings_status']
selected_features = credit_customers[important_columns]
data_for_clustering = selected_features.copy()

print("data_for_clustering:\n", data_for_clustering)
# pickle.dump(data_for_clustering,open("./ref_result/data_for_clustering.pkl","wb"))

import pandas as pd  
from sklearn.preprocessing import LabelEncoder 
import pickle 
  
# Integer-encode the two categorical features, each with its own fresh encoder.
for categorical_feature in ('savings_status', 'employment'):
    data_for_clustering[categorical_feature] = LabelEncoder().fit_transform(
        data_for_clustering[categorical_feature]
    )

# One-hot encode credit_history, dropping the first level's indicator column.
data_for_clustering = pd.get_dummies(
    data_for_clustering,
    columns=['credit_history'],
    drop_first=True,
)

print("data_for_clustering:\n", data_for_clustering)
# pickle.dump(data_for_clustering,open("./ref_result/data_for_clustering.pkl","wb"))

import pandas as pd  
from sklearn.preprocessing import StandardScaler 
import pickle 
  
# Standardize the encoded features before clustering.
feature_scaler = StandardScaler()
data_for_clustering_scaled = feature_scaler.fit_transform(data_for_clustering)

print("data_for_clustering_scaled:\n", data_for_clustering_scaled)
# pickle.dump(data_for_clustering_scaled,open("./ref_result/data_for_clustering_scaled.pkl","wb"))

import pandas as pd  
from sklearn.cluster import KMeans 
import pickle 
  
# Fit a four-cluster K-means model and read each row's assigned label.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(data_for_clustering_scaled)
cluster_labels = kmeans.labels_

print("cluster_labels:\n", cluster_labels)
# pickle.dump(cluster_labels,open("./ref_result/cluster_labels.pkl","wb"))

import pandas as pd  
import pickle 
  
# Attach the K-means labels to the customer table as the 'cluster' column.

credit_customers['cluster'] = cluster_labels 

print("credit_customers:\n", credit_customers)    
# pickle.dump(credit_customers,open("./ref_result/credit_customers.pkl","wb"))

import pandas as pd  
import pickle 
  
# Bundle the algorithm name, its parameters, and the labelled feature columns.
clustering_parameters = {"n_clusters": 4, "random_state": 42}
reported_columns = ['credit_history', 'age', 'employment', 'credit_amount', 'savings_status', 'cluster']
result = ("K-means", clustering_parameters, credit_customers[reported_columns])

print("result:\n", result)
# pickle.dump(result,open("./ref_result/result.pkl","wb"))

import pandas as pd  
import pickle 
  
# Rebuild the clustering input from the five selected features of the customer table.
important_columns = ['credit_history', 'age', 'employment', 'credit_amount', 'savings_status']
data_for_clustering = credit_customers.loc[:, important_columns].copy()

print("data_for_clustering:\n", data_for_clustering)
# pickle.dump(data_for_clustering,open("./ref_result/data_for_clustering.pkl","wb"))

import pandas as pd  
from sklearn.preprocessing import LabelEncoder 
import pickle 
  
# Integer-encode savings_status and employment with independent encoders.
savings_codes = LabelEncoder().fit_transform(data_for_clustering['savings_status'])
data_for_clustering['savings_status'] = savings_codes
employment_codes = LabelEncoder().fit_transform(data_for_clustering['employment'])
data_for_clustering['employment'] = employment_codes

print("data_for_clustering:\n", data_for_clustering)
# pickle.dump(data_for_clustering,open("./ref_result/data_for_clustering.pkl","wb"))

import pandas as pd  
import pickle 
  
# Expand credit_history into indicator columns, omitting the first level.
one_hot_options = {'columns': ['credit_history'], 'drop_first': True}
data_for_clustering = pd.get_dummies(data_for_clustering, **one_hot_options)

print("data_for_clustering:\n", data_for_clustering)
# pickle.dump(data_for_clustering,open("./ref_result/data_for_clustering.pkl","wb"))

import pandas as pd  
from sklearn.preprocessing import StandardScaler 
import pickle 
  
# Standardize the encoded feature matrix ahead of K-means.
standardizer = StandardScaler()
data_for_clustering_scaled = standardizer.fit_transform(data_for_clustering)

print("data_for_clustering_scaled:\n", data_for_clustering_scaled)
# pickle.dump(data_for_clustering_scaled,open("./ref_result/data_for_clustering_scaled.pkl","wb"))

import pandas as pd  
from sklearn.cluster import KMeans 
import pickle 
  
# Cluster the standardized features into four groups with a fixed seed.
kmeans = KMeans(
    n_clusters=4,
    random_state=42,
)
cluster_labels = kmeans.fit(data_for_clustering_scaled).labels_

print("cluster_labels:\n", cluster_labels)
# pickle.dump(cluster_labels,open("./ref_result/cluster_labels.pkl","wb"))

import pandas as pd  
import pickle 
  
# Store the latest K-means labels in the 'cluster' column of the customer table
# (this replaces the values assigned to that column earlier in the script).

credit_customers['cluster'] = cluster_labels 

print("credit_customers:\n", credit_customers)    
# pickle.dump(credit_customers,open("./ref_result/credit_customers.pkl","wb"))

import pandas as pd  
import pickle 
  
# Build one boolean mask per criterion, then keep the rows that satisfy all three.
good_credit_history = credit_customers['credit_history'].isin(['existing paid', 'all paid'])
age_group = credit_customers['age'].between(25, 45)  # both bounds inclusive
stable_employment = credit_customers['employment'].isin(['>=7', '4<=X<7'])

meets_all_criteria = good_credit_history & age_group & stable_employment
target_customers = credit_customers.loc[meets_all_criteria]

print("target_customers:\n", target_customers)
# pickle.dump(target_customers,open("./ref_result/target_customers.pkl","wb"))



import pandas as pd  
import pickle 
  
# Distinct cluster ids that contain at least one target customer.
target_customer_segments = target_customers['cluster'].unique().tolist()

print("target_customer_segments:\n", target_customer_segments)

# Persist the segment list. The directory is created on demand, and the
# context manager guarantees the file is flushed and closed after the dump.
os.makedirs("./ref_result", exist_ok=True)
with open("./ref_result/target_customer_segments.pkl", "wb") as output_file:
    pickle.dump(target_customer_segments, output_file)